library(rpart)         # to train decision trees
library(rpart.plot)    # to plot decision trees
library(randomForest)  # random forests
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(gbm)           # boosting
## Loaded gbm 2.1.8
library(tidyverse)     # tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine()  masks randomForest::combine()
## x dplyr::filter()   masks stats::filter()
## x dplyr::lag()      masks stats::lag()
## x ggplot2::margin() masks randomForest::margin()
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(cowplot)
library(ggplot2)
library(class)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-3
library(glmnetUtils)
## 
## Attaching package: 'glmnetUtils'
## The following objects are masked from 'package:glmnet':
## 
##     cv.glmnet, glmnet
library(corrplot)
## corrplot 0.92 loaded
rawfa <- read_csv('Food Access Research Atlas.csv')
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 72531 Columns: 147
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (113): State, County, MedianFamilyIncome, LAPOP1_10, LAPOP05_10, LAPOP1_...
## dbl  (34): CensusTract, Urban, Pop2010, OHU2010, GroupQuartersFlag, NUMGQTRS...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rawtract <- read_csv('pdb2019bgv6_us.csv')
## Rows: 220357 Columns: 346
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (22): GIDBG, State, State_name, County, County_name, Tract, Med_HHD_Inc...
## dbl (324): Block_group, Flag, LAND_AREA, AIAN_LAND, URBANIZED_AREA_POP_CEN_2...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
modtract <- rawtract[is.na(rawtract$Flag),] # removes uninhabitable areas
modtract1 <- modtract[!(is.na(modtract$Tot_Population_CEN_2010)|modtract$Tot_Population_CEN_2010==0),]
# removes areas with no residents

# Now, time to isolate the metrics that we actually care about
modtract2 <- modtract1 %>%
  select(GIDBG,Tract,Block_group,State_name,County_name,
         Tot_Population_CEN_2010,
         pct_URBAN_CLUSTER_POP_CEN_2010,
         pct_URBANIZED_AREA_POP_CEN_2010,
         pct_RURAL_POP_CEN_2010,
         pct_Males_CEN_2010,
         pct_Pop_under_5_CEN_2010,
         pct_Pop_5_17_CEN_2010,
         pct_Pop_18_24_CEN_2010,
         pct_Pop_25_44_CEN_2010,
         pct_Pop_45_64_CEN_2010,
         pct_Pop_65plus_CEN_2010,
         pct_Tot_GQ_CEN_2010,
         pct_Hispanic_CEN_2010,
         pct_NH_White_alone_CEN_2010,
         pct_NH_Blk_alone_CEN_2010,
         pct_NH_AIAN_alone_CEN_2010,
         pct_NH_Asian_alone_CEN_2010,
         pct_NH_NHOPI_alone_CEN_2010,
         pct_Not_HS_Grad_ACS_13_17,
         pct_College_ACS_13_17,
         pct_Prs_Blw_Pov_Lev_ACS_13_17,
         pct_No_Health_Ins_ACS_13_17,
         pct_Diff_HU_1yr_Ago_ACS_13_17,
         pct_Rel_Family_HHD_CEN_2010,
         pct_MrdCple_HHD_CEN_2010,
         pct_Female_No_HB_CEN_2010,
         pct_Sngl_Prns_HHD_CEN_2010,
         pct_HHD_PPL_Und_18_CEN_2010,
         avg_Tot_Prns_in_HHD_CEN_2010,
         pct_PUB_ASST_INC_ACS_13_17,
         avg_Agg_HH_INC_ACS_13_17,
         pct_Vacant_Units_CEN_2010,
         pct_Renter_Occp_HU_CEN_2010,
         pct_MLT_U2_9_STRC_ACS_13_17,
         pct_MLT_U10p_ACS_13_17,
         pct_Mobile_Homes_ACS_13_17,
         pct_NO_PH_SRVC_ACS_13_17,
         pct_No_Plumb_ACS_13_17,
         pct_Recent_Built_HU_ACS_13_17,
         avg_Agg_House_Value_ACS_13_17) %>%
  rename_with(tolower)
nrow(modtract2)-length(unique(modtract2$tract)) #There are 195505 repetitions of Tracts
## [1] 195505
# This repetition is caused by the fact there are multiple "blocks" within each Tract
# To be able to view the data at a Tract level (so we can combine with the Food Access data),
  # we need to group by Tract and find the weighted avg of the blocks' data for some variables
  # and the sum for others.

# Upon further research, there are also multiple tracts with the same number across states.
  # Thus, we will be using gidbg as an id for each census tract. This is more in line with
  # the other data set as well.

modtract3 <- modtract2 %>% separate(gidbg,into=c('gidbg','block'),sep=-1) %>% select(-block)

fintract <- na.omit(modtract3) %>%
  group_by(gidbg) %>%
  mutate(total_pop=sum(tot_population_cen_2010)) %>%
  mutate(pct_urban=weighted.mean(pct_urban_cluster_pop_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_urban_cluster_pop_cen_2010) %>%
  mutate(pct_urbanized=weighted.mean(pct_urbanized_area_pop_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_urbanized_area_pop_cen_2010) %>%
  mutate(pct_rural=weighted.mean(pct_rural_pop_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_rural_pop_cen_2010) %>%
  mutate(pct_male=weighted.mean(pct_males_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_males_cen_2010) %>%
  mutate(pct_under_5=weighted.mean(pct_pop_under_5_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_pop_under_5_cen_2010) %>%
  mutate(pct_5_17=weighted.mean(pct_pop_5_17_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_pop_5_17_cen_2010) %>%
  mutate(pct_18_24=weighted.mean(pct_pop_18_24_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_pop_18_24_cen_2010) %>%
  mutate(pct_25_44=weighted.mean(pct_pop_25_44_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_pop_25_44_cen_2010) %>%
  mutate(pct_45_64=weighted.mean(pct_pop_45_64_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_pop_45_64_cen_2010) %>%
  mutate(pct_65plus=weighted.mean(pct_pop_65plus_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_pop_65plus_cen_2010) %>%
  mutate(pct_group_quarters=weighted.mean(pct_tot_gq_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_tot_gq_cen_2010) %>%
  mutate(pct_hispanic=weighted.mean(pct_hispanic_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_hispanic_cen_2010) %>%
  mutate(pct_white=weighted.mean(pct_nh_white_alone_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_nh_white_alone_cen_2010) %>%
  mutate(pct_black=weighted.mean(pct_nh_blk_alone_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_nh_blk_alone_cen_2010) %>%
  mutate(pct_aian=weighted.mean(pct_nh_aian_alone_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_nh_aian_alone_cen_2010) %>%
  mutate(pct_asian=weighted.mean(pct_nh_asian_alone_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_nh_asian_alone_cen_2010) %>%
  mutate(pct_nhopi=weighted.mean(pct_nh_nhopi_alone_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_nh_nhopi_alone_cen_2010) %>%
  mutate(pct_not_hs_grad=weighted.mean(pct_not_hs_grad_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_not_hs_grad_acs_13_17) %>%
  mutate(pct_college=weighted.mean(pct_college_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_college_acs_13_17) %>%
  mutate(pct_blw_pov=weighted.mean(pct_prs_blw_pov_lev_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_prs_blw_pov_lev_acs_13_17) %>%
  mutate(pct_no_h_ins=weighted.mean(pct_no_health_ins_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_no_health_ins_acs_13_17) %>%
  mutate(pct_moved=weighted.mean(pct_diff_hu_1yr_ago_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_diff_hu_1yr_ago_acs_13_17) %>%
  mutate(pct_family_hhd=weighted.mean(pct_rel_family_hhd_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_rel_family_hhd_cen_2010) %>%
  mutate(pct_married_hhd=weighted.mean(pct_mrdcple_hhd_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_mrdcple_hhd_cen_2010) %>%
  mutate(pct_single_female=weighted.mean(pct_female_no_hb_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_female_no_hb_cen_2010) %>%
  mutate(pct_solo_res=weighted.mean(pct_sngl_prns_hhd_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_sngl_prns_hhd_cen_2010) %>%
  mutate(pct_hhd_children=weighted.mean(pct_hhd_ppl_und_18_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_hhd_ppl_und_18_cen_2010) %>%
  mutate(avg_ppl_hhd=weighted.mean(avg_tot_prns_in_hhd_cen_2010,tot_population_cen_2010)) %>%
  select(-avg_tot_prns_in_hhd_cen_2010) %>%
  mutate(pct_pub_assist=weighted.mean(pct_pub_asst_inc_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_pub_asst_inc_acs_13_17) %>%
  mutate(interim_income=as.numeric(gsub("[\\$,]", "", avg_agg_hh_inc_acs_13_17))) %>%
  # ^ adjusting to be numeric
  mutate(avg_hhd_income=weighted.mean(interim_income,tot_population_cen_2010)) %>%
  select(-avg_agg_hh_inc_acs_13_17) %>%
  select(-interim_income) %>%
  mutate(pct_vacant_units=weighted.mean(pct_vacant_units_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_vacant_units_cen_2010) %>%
  mutate(pct_vacant_units=weighted.mean(pct_renter_occp_hu_cen_2010,tot_population_cen_2010)) %>%
  select(-pct_renter_occp_hu_cen_2010) %>%
  mutate(pct_multi_2_9=weighted.mean(pct_mlt_u2_9_strc_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_mlt_u2_9_strc_acs_13_17) %>%
  mutate(pct_multi_10plus=weighted.mean(pct_mlt_u10p_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_mlt_u10p_acs_13_17) %>%
  mutate(pct_mobile_homes=weighted.mean(pct_mobile_homes_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_mobile_homes_acs_13_17) %>%
  mutate(pct_no_service=weighted.mean(pct_no_ph_srvc_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_no_ph_srvc_acs_13_17) %>%
  mutate(pct_no_plumbing=weighted.mean(pct_no_plumb_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_no_plumb_acs_13_17) %>%
  mutate(pct_new_house=weighted.mean(pct_recent_built_hu_acs_13_17,tot_population_cen_2010)) %>%
  select(-pct_recent_built_hu_acs_13_17) %>%
  mutate(interim_house_value=as.numeric(gsub("[\\$,]", "", avg_agg_house_value_acs_13_17))) %>%
  # ^ adjusting to be numeric
  mutate(avg_house_value=weighted.mean(interim_house_value,tot_population_cen_2010)) %>%
  select(-avg_agg_house_value_acs_13_17) %>%
  select(-interim_house_value) %>%
  # now, to take out some columns we only still needed for computation
  select(-c(tot_population_cen_2010,block_group,tract)) %>%
  distinct(gidbg,total_pop,pct_urban,.keep_all=TRUE)
rawfa1 <- rawfa %>% rename_with(tolower) %>%           # selecting only important variables 
  select(censustract,state,county,la1and10,lahalfand10,       # (both explanatory and response)
           la1and20,latracts_half,latracts1,latracts10,
           latracts20,latractsvehicle_20,lapophalfshare,
           lakidshalf,laseniorshalf,lahunvhalfshare,lapop1share,
           lakids1,laseniors1,lahunv1share,lapop10share,lakids10,
           laseniors10,lahunv10share,lapop20share,lakids20,
           laseniors20,lahunv20share,tractkids,tractseniors,tractsnap,
           pop2010)

# the data only gave the # of kids/seniors with low access, as well as what percent of the
# total population they comprise, but not what percent OF kids/seniors are low access

finfa <- rawfa1 %>%
  mutate(snap_share=tractsnap/pop2010*100) %>%              # calculating % on SNAP
  select(-c(tractsnap,pop2010)) %>%
  mutate(tot_share_half=as.numeric(lapophalfshare)) %>% # making columns not needing
  select(-lapophalfshare) %>%                           # changing numeric
  mutate(tot_share_1=as.numeric(lapop1share)) %>%       # also manages order
  select(-lapop1share) %>%
  mutate(tot_share_10=as.numeric(lapop10share)) %>%
  select(-lapop10share) %>%
  mutate(tot_share_20=as.numeric(lapop20share)) %>%
  select(-lapop20share) %>%
  mutate(no_vehic_half=as.numeric(lahunvhalfshare)) %>%
  select(-lahunvhalfshare) %>%
  mutate(no_vehic_1=as.numeric(lahunv1share)) %>%
  select(-lahunv1share) %>%
  mutate(no_vehic_10=as.numeric(lahunv10share)) %>%
  select(-lahunv10share) %>%
  mutate(no_vehic_20=as.numeric(lahunv20share)) %>%
  select(-lahunv20share) %>%
  mutate(kids_share_half=as.numeric(lakidshalf)/tractkids) %>%          #changes
  select(-lakidshalf) %>%
  mutate(seniors_share_half=as.numeric(laseniorshalf)/tractseniors) %>%
  select(-laseniorshalf) %>%
  mutate(kids_share_1=as.numeric(lakids1)/tractkids) %>%
  select(-lakids1) %>%
  mutate(seniors_share_1=as.numeric(laseniors1)/tractseniors) %>%
  select(-laseniors1) %>%
  mutate(kids_share_10=as.numeric(lakids10)/tractkids) %>%
  select(-lakids10) %>%
  mutate(seniors_share_10=as.numeric(laseniors10)/tractseniors) %>%
  select(-laseniors10) %>%
  mutate(kids_share_20=as.numeric(lakids20)/tractkids) %>%
  select(-lakids20) %>%
  mutate(seniors_share_20=as.numeric(laseniors20)/tractseniors) %>%
  select(-laseniors20) %>%
  select(-c(tractkids,tractseniors,state,county))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
finfa$censustract <- as.numeric(finfa$censustract)
fintract$gidbg <- as.numeric(fintract$gidbg)
finfa <- finfa %>% rename(tract=censustract)
fintract <- fintract %>% rename(tract=gidbg)

us_food_access <- inner_join(fintract,finfa,by='tract')
us_food_access <- us_food_access[!(us_food_access$tract==46102940900 |
                   us_food_access$tract==2158000100 |
                   us_food_access$tract==46102940500),]

Analysis

set.seed(250)
train_samples <- sample(1:nrow(us_food_access),0.8*nrow(us_food_access))
food_train <- us_food_access[train_samples,]
food_test <- us_food_access[-train_samples,]

glm_fit_10 <- glm(latracts10 ~ 
                   pct_urban+pct_urbanized+pct_rural+
                   pct_male+pct_under_5+
                   pct_5_17+pct_18_24+
                   pct_25_44+pct_45_64+
                   pct_65plus+pct_group_quarters+
                   pct_hispanic+pct_white+
                   pct_black+pct_aian+
                   pct_asian+pct_nhopi+
                   pct_not_hs_grad+pct_college+
                   pct_blw_pov+pct_no_h_ins+
                   pct_moved+pct_family_hhd+
                   pct_single_female+
                   pct_solo_res+pct_hhd_children+
                   avg_ppl_hhd+pct_pub_assist+
                   avg_hhd_income+
                   pct_vacant_units+
                   pct_multi_2_9+
                   pct_multi_10plus+
                   pct_mobile_homes+
                   pct_no_service+
                   pct_no_plumbing+pct_new_house+
                   avg_house_value+snap_share,
                   family='binomial',
                 data=food_train)
summary(glm_fit_10)
## 
## Call:
## glm(formula = latracts10 ~ pct_urban + pct_urbanized + pct_rural + 
##     pct_male + pct_under_5 + pct_5_17 + pct_18_24 + pct_25_44 + 
##     pct_45_64 + pct_65plus + pct_group_quarters + pct_hispanic + 
##     pct_white + pct_black + pct_aian + pct_asian + pct_nhopi + 
##     pct_not_hs_grad + pct_college + pct_blw_pov + pct_no_h_ins + 
##     pct_moved + pct_family_hhd + pct_single_female + pct_solo_res + 
##     pct_hhd_children + avg_ppl_hhd + pct_pub_assist + avg_hhd_income + 
##     pct_vacant_units + pct_multi_2_9 + pct_multi_10plus + pct_mobile_homes + 
##     pct_no_service + pct_no_plumbing + pct_new_house + avg_house_value + 
##     snap_share, family = "binomial", data = food_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5508  -0.0417  -0.0091  -0.0042   4.0012  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.753e+04  1.435e+04  -1.222 0.221727    
## pct_urban           1.702e+02  1.434e+02   1.187 0.235178    
## pct_urbanized       1.702e+02  1.434e+02   1.187 0.235264    
## pct_rural           1.703e+02  1.434e+02   1.187 0.235076    
## pct_male            2.245e-01  2.263e-02   9.923  < 2e-16 ***
## pct_under_5         5.195e+00  5.607e+00   0.927 0.354138    
## pct_5_17            5.029e+00  5.606e+00   0.897 0.369725    
## pct_18_24           4.843e+00  5.606e+00   0.864 0.387557    
## pct_25_44           4.772e+00  5.606e+00   0.851 0.394626    
## pct_45_64           4.877e+00  5.606e+00   0.870 0.384357    
## pct_65plus          4.892e+00  5.606e+00   0.873 0.382883    
## pct_group_quarters -2.161e-02  1.192e-02  -1.812 0.069935 .  
## pct_hispanic        7.385e-02  2.414e-02   3.059 0.002221 ** 
## pct_white           5.059e-02  2.391e-02   2.116 0.034373 *  
## pct_black           9.979e-02  2.453e-02   4.069 4.73e-05 ***
## pct_aian            1.074e-01  2.587e-02   4.149 3.33e-05 ***
## pct_asian           9.916e-02  3.840e-02   2.582 0.009820 ** 
## pct_nhopi           2.323e-01  8.157e-02   2.848 0.004398 ** 
## pct_not_hs_grad    -1.490e-02  6.273e-03  -2.376 0.017523 *  
## pct_college        -7.181e-03  5.346e-03  -1.343 0.179201    
## pct_blw_pov         8.457e-03  5.989e-03   1.412 0.157886    
## pct_no_h_ins        1.130e-03  6.014e-03   0.188 0.850998    
## pct_moved           8.725e-03  6.380e-03   1.368 0.171442    
## pct_family_hhd      6.418e-02  2.384e-02   2.692 0.007093 ** 
## pct_single_female  -1.865e-01  1.779e-02 -10.484  < 2e-16 ***
## pct_solo_res        6.858e-02  2.801e-02   2.448 0.014346 *  
## pct_hhd_children    2.268e-02  2.164e-02   1.048 0.294684    
## avg_ppl_hhd        -3.087e+00  4.460e-01  -6.922 4.47e-12 ***
## pct_pub_assist      2.525e-02  1.459e-02   1.730 0.083580 .  
## avg_hhd_income     -1.065e-05  3.093e-06  -3.444 0.000574 ***
## pct_vacant_units    1.583e-02  6.057e-03   2.614 0.008951 ** 
## pct_multi_2_9      -4.308e-02  9.864e-03  -4.367 1.26e-05 ***
## pct_multi_10plus   -6.160e-02  1.484e-02  -4.150 3.32e-05 ***
## pct_mobile_homes    1.718e-02  2.798e-03   6.139 8.29e-10 ***
## pct_no_service     -1.201e-02  1.113e-02  -1.079 0.280524    
## pct_no_plumbing     8.188e-02  6.457e-03  12.680  < 2e-16 ***
## pct_new_house       4.328e-02  2.096e-02   2.065 0.038880 *  
## avg_house_value    -5.415e-07  5.412e-07  -1.001 0.317033    
## snap_share         -8.014e-02  1.597e-02  -5.019 5.18e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20675.1  on 56116  degrees of freedom
## Residual deviance:  9624.7  on 56078  degrees of freedom
## AIC: 9702.7
## 
## Number of Fisher Scoring iterations: 11
fitted_probabilities <- predict(glm_fit_10,
        newdata=food_test,type="response")
hist(fitted_probabilities)

predictions <- fitted_probabilities>0.5
food_test <- food_test %>%
  ungroup() %>%
  mutate(predicted_flag=predictions)
# misclassification rate
food_test %>% summarise(mean(latracts10!=predicted_flag))
food_test %>%
  select(latracts10,predicted_flag) %>%
  table()
##           predicted_flag
## latracts10 FALSE  TRUE
##          0 13314   105
##          1   387   224
C_FN<- 53000000000/nrow(us_food_access)+1000000 # total cost of food inequity / # of tracts
C_FP <- 500000 # cost of building supermarket
thresh <- C_FP/(C_FP+C_FN)
thresh
## [1] 0.2216748
predictions <- as.numeric(fitted_probabilities>thresh)
food_test %>%
  mutate(predicted_flag=predictions) %>%
  select(latracts10,predicted_flag) %>%
  table()
##           predicted_flag
## latracts10     0     1
##          0 12845   574
##          1   142   469
# ROC curve
roc_log <- roc(food_test %>% pull(latracts10),
               fitted_probabilities)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
fpr <- 1-roc_log$specificities
tpr <- roc_log$sensitivities
tibble(fpr,tpr) %>%
  ggplot(aes(x=fpr,y=tpr)) +
  geom_line() +
  geom_abline(slope=1,linetype='dashed') +
  theme_bw()

roc_log$auc
## Area under the curve: 0.9698
food_train %>%
  ggplot(aes(x=pct_college,y=latracts10)) +
  geom_jitter(height=0.05) +
  geom_smooth(method='glm',
              formula='y~x',
              method.args=list(family='binomial'),
              se=F) +
  ylab("Prob(flagged for having low food access") +
  geom_hline(yintercept=thresh,linetype='dashed',color='red')

  theme_bw()
## List of 93
##  $ line                      :List of 6
##   ..$ colour       : chr "black"
##   ..$ size         : num 0.5
##   ..$ linetype     : num 1
##   ..$ lineend      : chr "butt"
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ rect                      :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : chr "black"
##   ..$ size         : num 0.5
##   ..$ linetype     : num 1
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ text                      :List of 11
##   ..$ family       : chr ""
##   ..$ face         : chr "plain"
##   ..$ colour       : chr "black"
##   ..$ size         : num 11
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 0
##   ..$ lineheight   : num 0.9
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ title                     : NULL
##  $ aspect.ratio              : NULL
##  $ axis.title                : NULL
##  $ axis.title.x              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.75points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.top          :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.75points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.bottom       : NULL
##  $ axis.title.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.75points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y.left         : NULL
##  $ axis.title.y.right        :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.75points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text                 :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey30"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.2points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.top           :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.2points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.bottom        : NULL
##  $ axis.text.y               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.2points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y.left          : NULL
##  $ axis.text.y.right         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.2points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.ticks                :List of 6
##   ..$ colour       : chr "grey20"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ axis.ticks.x              : NULL
##  $ axis.ticks.x.top          : NULL
##  $ axis.ticks.x.bottom       : NULL
##  $ axis.ticks.y              : NULL
##  $ axis.ticks.y.left         : NULL
##  $ axis.ticks.y.right        : NULL
##  $ axis.ticks.length         : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ axis.ticks.length.x       : NULL
##  $ axis.ticks.length.x.top   : NULL
##  $ axis.ticks.length.x.bottom: NULL
##  $ axis.ticks.length.y       : NULL
##  $ axis.ticks.length.y.left  : NULL
##  $ axis.ticks.length.y.right : NULL
##  $ axis.line                 : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.line.x               : NULL
##  $ axis.line.x.top           : NULL
##  $ axis.line.x.bottom        : NULL
##  $ axis.line.y               : NULL
##  $ axis.line.y.left          : NULL
##  $ axis.line.y.right         : NULL
##  $ legend.background         :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : logi NA
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.margin             : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing            : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing.x          : NULL
##  $ legend.spacing.y          : NULL
##  $ legend.key                :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : logi NA
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.key.size           : 'simpleUnit' num 1.2lines
##   ..- attr(*, "unit")= int 3
##  $ legend.key.height         : NULL
##  $ legend.key.width          : NULL
##  $ legend.text               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.text.align         : NULL
##  $ legend.title              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.title.align        : NULL
##  $ legend.position           : chr "right"
##  $ legend.direction          : NULL
##  $ legend.justification      : chr "center"
##  $ legend.box                : NULL
##  $ legend.box.just           : NULL
##  $ legend.box.margin         : 'margin' num [1:4] 0cm 0cm 0cm 0cm
##   ..- attr(*, "unit")= int 1
##  $ legend.box.background     : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.box.spacing        : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ panel.background          :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : logi NA
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ panel.border              :List of 5
##   ..$ fill         : logi NA
##   ..$ colour       : chr "grey20"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ panel.spacing             : 'simpleUnit' num 5.5points
##   ..- attr(*, "unit")= int 8
##  $ panel.spacing.x           : NULL
##  $ panel.spacing.y           : NULL
##  $ panel.grid                :List of 6
##   ..$ colour       : chr "grey92"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major          : NULL
##  $ panel.grid.minor          :List of 6
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.5
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major.x        : NULL
##  $ panel.grid.major.y        : NULL
##  $ panel.grid.minor.x        : NULL
##  $ panel.grid.minor.y        : NULL
##  $ panel.ontop               : logi FALSE
##  $ plot.background           :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : chr "white"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ plot.title                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.title.position       : chr "panel"
##  $ plot.subtitle             :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : num 1
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 5.5points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption.position     : chr "panel"
##  $ plot.tag                  :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.tag.position         : chr "topleft"
##  $ plot.margin               : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ strip.background          :List of 5
##   ..$ fill         : chr "grey85"
##   ..$ colour       : chr "grey20"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ strip.background.x        : NULL
##  $ strip.background.y        : NULL
##  $ strip.placement           : chr "inside"
##  $ strip.text                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey10"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 4.4points 4.4points 4.4points 4.4points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.x              : NULL
##  $ strip.text.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.switch.pad.grid     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ strip.switch.pad.wrap     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ strip.text.y.left         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi TRUE
##  - attr(*, "validate")= logi TRUE
train1 <- food_train %>% ungroup() %>% select(pct_urban,
                   pct_urbanized,pct_rural,
                   pct_male,pct_under_5,
                   pct_5_17,pct_18_24,
                   pct_25_44,pct_45_64,
                   pct_65plus,pct_group_quarters,
                   pct_hispanic,pct_white,
                   pct_black,pct_aian,
                   pct_asian,pct_nhopi,
                   pct_not_hs_grad,pct_college,
                   pct_blw_pov,pct_no_h_ins,
                   pct_moved,pct_family_hhd,
                   pct_single_female,
                   pct_solo_res,pct_hhd_children,
                   avg_ppl_hhd,pct_pub_assist,
                   avg_hhd_income,
                   pct_vacant_units,
                   pct_multi_2_9,
                   pct_multi_10plus,
                   pct_mobile_homes,
                   pct_no_service,
                   pct_no_plumbing,pct_new_house,
                   avg_house_value,snap_share) 
test1 <- food_test %>% select(pct_urban,
                   pct_urbanized,pct_rural,
                   pct_male,pct_under_5,
                   pct_5_17,pct_18_24,
                   pct_25_44,pct_45_64,
                   pct_65plus,pct_group_quarters,
                   pct_hispanic,pct_white,
                   pct_black,pct_aian,
                   pct_asian,pct_nhopi,
                   pct_not_hs_grad,pct_college,
                   pct_blw_pov,pct_no_h_ins,
                   pct_moved,pct_family_hhd,
                   pct_single_female,
                   pct_solo_res,pct_hhd_children,
                   avg_ppl_hhd,pct_pub_assist,
                   avg_hhd_income,
                   pct_vacant_units,
                   pct_multi_2_9,
                   pct_multi_10plus,
                   pct_mobile_homes,
                   pct_no_service,
                   pct_no_plumbing,pct_new_house,
                   avg_house_value,snap_share)

knn_results <- knn(
  train=train1,
  test= test1,
  cl = food_train$latracts10,
  k=10,
  prob=TRUE
)

# adding results to test tibble
food_test <- food_test %>%
  ungroup() %>%
  mutate(prediction=as.numeric(knn_results == 1),
         probability = attributes(knn_results)$prob,
         probability = ifelse(prediction==1, probability, 1-probability))
  
#plotting the results
food_test %>%
  ggplot(aes(x=pct_college)) +
  geom_jitter(aes(y = latracts10, colour = as.factor(prediction)),
              width = 0, height = 0.1) +
  geom_smooth(aes(y = probability)) +
  scale_y_continuous(breaks = c(0,1)) +
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# performance metrics of knn
conf_matrix_knn <- food_test %>%
  select(latracts10,prediction) %>%
  table()
conf_matrix_knn
##           prediction
## latracts10     0     1
##          0 13414     5
##          1   610     1
# not possible to find false positive/false negative rate because there are no
# predicted flags in this model
food_test %>%
  summarise(weighted_error = mean(C_FP*(prediction == 1 & latracts10 == 0) +
                                  C_FN*(prediction == 0 & latracts10 == 1)))
# this all comes from the cost of incorrectly predicting a tract should not
# be flagged
# ROC curve

roc_knn <- roc(food_test %>% pull(latracts10),
                food_test %>% pull(probability))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
tibble(FPR_knn = 1-roc_knn$specificities,
       TPR_knn = roc_knn$sensitivities) %>%
  ggplot(aes(x=FPR_knn,y=TPR_knn)) +
  geom_line() +
  geom_abline(slope = 1, linetype = "dashed") +
  xlim(0,1)+
  geom_point(x=0,y=roc_knn$sensitivities[length(roc_knn)], # FPR = 0
             color='red') +
  theme_bw()
## Warning: Removed 11 rows containing missing values (geom_point).

roc_knn$auc
## Area under the curve: 0.6224
num_thresholds = 100
thresholds = seq(0,1,length.out=num_thresholds) 
weighted_errors = numeric(num_thresholds)
for(threshold_idx in 1:num_thresholds){
  threshold = thresholds[threshold_idx]
  weighted_errors[threshold_idx] =
    food_test %>%
    mutate(prediction = probability >= threshold) %>%
    summarise(weighted_error = mean(C_FP*(prediction == 1&latracts10==0) +
                                      C_FN*(prediction==0 &latracts10==1))) %>%
    pull()
}

tibble(threshold = thresholds, weighted_error = weighted_errors) %>%
  ggplot(aes(x = threshold, y = weighted_error)) +
  geom_line() +
  geom_vline(xintercept = c(C_FP/(C_FP + C_FN), 0.5),
             linetype = "dashed", colour = "red") +
  labs(x = "KNN probability threshold", y = "Average cost per transaction") +
  theme_bw()

food_test_weighted <- food_test %>%
  mutate(prediction = probability >= C_FP/(C_FP + C_FN))
food_test_weighted %>%
  ggplot(aes(x = pct_college)) +
  geom_jitter(aes(y = latracts10, colour = prediction), width = 0, height = 0.1) +
  geom_smooth(aes(y = probability)) +
  geom_hline(yintercept = c(0.5, C_FP/(C_FP + C_FN)),
           linetype = "dashed", colour = "red") +
  scale_y_continuous(breaks = c(0,1)) +
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

tibble(
  TP=sum(as.numeric(food_test_weighted$prediction)==1&food_test_weighted$latracts10==1),
  FP=sum(as.numeric(food_test_weighted$prediction)==1&food_test_weighted$latracts10==0),
  FN=sum(as.numeric(food_test_weighted$prediction)==0&food_test_weighted$latracts10==1),
  TN=sum(as.numeric(food_test_weighted$prediction)==0&food_test_weighted$latracts10==0))

Ridge and Lasso Regressions

source('/Users/shyankoul/Documents/Stat-471/stat-471-fall-2021/functions/plot_glmnet.R')
source('/Users/shyankoul/Documents/Stat-471/stat-471-fall-2021/functions/cross_validate_spline.R')

train2 <- train1 %>% ungroup() %>%
  mutate(latracts10=food_train$latracts10)
ridge_fit <- cv.glmnet(latracts10 ~ .,
                       alpha=0,
                       nfolds=10,
                       family='binomial',
                       type.measure='class',
                       data=train2)
plot(ridge_fit)

plot_glmnet(ridge_fit,train2,features_to_plot=10)

probabilities <- predict(ridge_fit,
                         newdata=test1,
                         s='lambda.1se',
                         type='class') %>%
  as.numeric()
table(food_test$latracts10,probabilities)
##    probabilities
##         0     1
##   0 13390    29
##   1   555    56
lasso_fit <- cv.glmnet(latracts10 ~ .,
                       alpha=1,
                       nfolds=10,
                       family='binomial',
                       type.measure='class',
                       data=train2)
plot(lasso_fit)

plot_glmnet(lasso_fit,train2,features_to_plot=10)

prob_lasso <- predict(lasso_fit,
                      newdata=test1,
                      s='lambda.1se',
                      type='class') %>%
  as.numeric()
table(food_test$latracts10,prob_lasso)
##    prob_lasso
##         0     1
##   0 13335    84
##   1   431   180

Decision Tree Models

rf_fit3 <- randomForest(as.factor(as.character(latracts10)) ~ .,
                        mtry=3,
                       data=train2)
rf_fit9 <- randomForest(as.factor(as.character(latracts10)) ~ .,
                        mtry=9,
                       data=train2)
rf_fit19 <- randomForest(as.factor(as.character(latracts10)) ~ .,
                        mtry=19,
                       data=train2)
rf_fit38 <- randomForest(as.factor(as.character(latracts10)) ~ .,
                        mtry=38,
                       data=train2)
oob_errors = bind_rows(
  tibble(ntree = 1:500, oob_err = rf_fit3$err.rate[,1], m = 3),
  tibble(ntree = 1:500, oob_err = rf_fit9$err.rate[,1], m = 9),
  tibble(ntree = 1:500, oob_err = rf_fit19$err.rate[,1], m = 19),
  tibble(ntree = 1:500, oob_err = rf_fit38$err.rate[,1], m = 38)
)
oob_errors
 oob_errors %>%
  ggplot(aes(x = ntree, y = oob_err, colour = factor(m))) +
  geom_line() + theme_bw()

head(rf_fit9$importance,5)
##               MeanDecreaseGini
## pct_urban             98.75713
## pct_urbanized        215.11152
## pct_rural            610.38799
## pct_male             166.14073
## pct_under_5          102.21102
varImpPlot(rf_fit9)

rf_predictions <- as.numeric(as.character(predict(rf_fit9,newdata=food_test)))
mean((rf_predictions-food_test$latracts10)^2)
## [1] 0.03442623
gbm_fit1 = gbm(latracts10 ~ .,
              distribution = "bernoulli",
              n.trees = 300,
              interaction.depth = 1,
              shrinkage = 0.1,
              cv.folds = 5,
              data = train2)
gbm_fit2 = gbm(latracts10 ~ .,
              distribution = "bernoulli",
              n.trees = 300,
              interaction.depth = 2,
              shrinkage = 0.1,
              cv.folds = 5,
              data = train2)
gbm_fit3 = gbm(latracts10 ~ .,
              distribution = "bernoulli",
              n.trees = 300,
              interaction.depth = 3,
              shrinkage = 0.1,
              cv.folds = 5,
              data = train2)

gbm_fit5 = gbm(latracts10 ~ .,
              distribution = "bernoulli",
              n.trees = 300,
              interaction.depth = 5,
              shrinkage = 0.1,
              cv.folds = 5,
              data = train2)

ntrees <- 300
cv_errors_boost <- bind_rows(
  tibble(ntree = 1:ntrees, cv_err = gbm_fit1$cv.error, depth = 1),
  tibble(ntree = 1:ntrees, cv_err = gbm_fit2$cv.error, depth = 2),
  tibble(ntree = 1:ntrees, cv_err = gbm_fit3$cv.error, depth = 3),
  tibble(ntree = 1:ntrees, cv_err = gbm_fit5$cv.error, depth = 5))

cv_errors_boost %>%
  ggplot(aes(x = ntree, y = cv_err, colour = factor(depth))) +
  geom_line() + theme_bw()

optimal_num_trees = gbm.perf(gbm_fit5, plot.it = FALSE)
summary(gbm_fit3,n.trees=optimal_num_trees,plotit=FALSE)
plot(gbm_fit5, i.var = "pct_rural", n.trees = optimal_num_trees, type = "response")

plot(gbm_fit3, i.var = "pct_no_plumbing", n.trees = optimal_num_trees, type = "response")

corrplot.mixed(cor(train1),
               lower = "number", 
               upper = "circle",
               tl.col = "black")